Call Libraries
library(tidyverse)
library(car)
library(moments)
library(glmnet)
Calling the Transformed Datasets
income_cleaned = read_csv('Shiny_app/data/income_cleaned.csv')
industry_cleaned = read_csv('Shiny_app/data/industry_cleaned.csv')
Creating the Models
sat.model.summary <- function (df, field, sat.formula){
#Shapiro-Wilks test to evaluate normality
print(shapiro.test(df[[field]]))
#Kurtosis evaluation (normal distribution has a value close to 3)
print('kurtosis')
print(kurtosis(df[[field]]))
linear.model.cleaned = lm(sat.formula, data = df)
print(summary(linear.model.cleaned))
plot(linear.model.cleaned)
#histograms of response variable to check distribution
print(df %>%
ggplot(aes_string(field)) +
geom_histogram() +
labs(title = 'Average Credit Amount Distribution') +
theme(plot.title = element_text(hjust = 0.5)))
#Checking multicollinearity using VIF measurement
print(vif(linear.model.cleaned))
influencePlot(linear.model.cleaned)
#avPlots(linear.model.cleaned)
}
sat.formula <- Avg ~ .
sat.field <- 'Avg'
sat.model.summary(income_cleaned, sat.field, sat.formula)
Shapiro-Wilk normality test
data: df[[field]]
W = 0.17297, p-value < 2.2e-16
[1] "kurtosis"
[1] 172.7518
Call:
lm(formula = sat.formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-11338570 -547702 94139 421302 87181761
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -1.157e+08 5.512e+07 -2.100 0.03590 *
Year 5.721e+04 2.731e+04 2.095 0.03634 *
NameAlternative Fuels and Electric Vehicle Recharging Property Credit -3.287e+05 1.242e+06 -0.265 0.79140
NameAlternative Minimum Tax Credit 6.538e+05 9.757e+05 0.670 0.50286
NameBeer Production Credit 3.316e+05 1.290e+06 0.257 0.79715
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15 1.429e+06 9.960e+05 1.434 0.15162
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 4.098e+06 1.421e+06 2.883 0.00398 **
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08 1.182e+06 9.927e+05 1.191 0.23386
NameBrownfield Tax Credits - Remediation Real Property Tax Credit -7.587e+04 9.920e+05 -0.076 0.93904
NameClean Heating Fuel Credit 7.148e+04 1.024e+06 0.070 0.94436
NameConservation Easement Tax Credit 1.143e+05 1.064e+06 0.107 0.91444
NameCredit for Employment of Persons with Disabilities -9.343e+05 1.119e+06 -0.835 0.40391
NameCredit for Purchase of an Automated External Defibrillator -1.449e+05 9.695e+05 -0.150 0.88117
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities 2.316e+05 1.779e+06 0.130 0.89645
NameEmpire State Apprentice Tax Credit -7.983e+05 1.524e+06 -0.524 0.60040
NameEmpire State Commercial Production Credit 2.183e+05 1.291e+06 0.169 0.86576
NameEmpire State Film Post Production Credit 2.521e+04 1.049e+06 0.024 0.98082
NameEmpire State Film Production Credit 1.145e+07 9.967e+05 11.485 < 2e-16 ***
NameEmpire State Musical and Theatrical Production Credit -2.342e+05 1.775e+06 -0.132 0.89507
NameExcelsior Jobs Program Credit 1.035e+05 9.545e+05 0.108 0.91366
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit 2.752e+06 8.862e+05 3.105 0.00193 **
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit 4.845e+05 9.213e+05 0.526 0.59899
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes 1.245e+06 9.239e+05 1.348 0.17785
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners -1.025e+05 9.549e+05 -0.107 0.91455
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit -6.145e+04 9.387e+05 -0.065 0.94781
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners 9.119e+04 1.044e+06 0.087 0.93038
NameFarm Workforce Retention Credit -7.802e+03 1.285e+06 -0.006 0.99516
NameFarmers' School Tax Credit 1.440e+05 1.021e+06 0.141 0.88786
NameHire a Veteran Credit -1.238e+06 1.782e+06 -0.695 0.48721
NameHistoric Properties Rehabilitation Credit 1.352e+06 1.055e+06 1.282 0.20015
NameIndustrial or Manufacturing Business Tax Credit 4.997e+05 1.074e+06 0.465 0.64173
NameInvestment Tax Credit 4.463e+05 8.895e+05 0.502 0.61588
NameInvestment Tax Credit for the Financial Services Industry 3.976e+05 1.008e+06 0.395 0.69325
NameLife Sciences Research & Development Tax Credit -8.207e+04 2.099e+06 -0.039 0.96882
NameLong-Term Care Insurance Credit 1.246e+05 9.808e+05 0.127 0.89892
NameLow-Income Housing Credit -2.764e+04 1.113e+06 -0.025 0.98020
NameMinimum Wage Reimbursement Credit -2.931e+05 1.006e+06 -0.291 0.77075
NameMortgage Servicing Tax Credit -4.540e+05 1.085e+06 -0.418 0.67565
NameNew York Youth Jobs Program Tax Credit -2.587e+05 9.381e+05 -0.276 0.78279
NameQETC Capital Tax Credit 2.616e+05 1.386e+06 0.189 0.85032
NameQETC Employment Credit 1.335e+05 1.026e+06 0.130 0.89641
NameQETC Facilities, Operations, and Training Credit 5.153e+05 1.918e+06 0.269 0.78820
NameReal Property Tax Relief Credit for Manufacturing -2.635e+05 9.654e+05 -0.273 0.78490
NameSpecial Additional Mortgage Recording Tax Credit 3.678e+04 9.244e+05 0.040 0.96827
NameSTART-UP NY Tax Elimination Credit 4.061e+03 1.130e+06 0.004 0.99713
Group1,000,000 - 24,999,999 2.170e+05 3.501e+05 0.620 0.53548
Group100,000 - 499,999 1.352e+05 3.579e+05 0.378 0.70572
Group100,000,000 - 499,999,999 9.644e+05 3.937e+05 2.449 0.01441 *
Group25,000,000 - 49,999,999 4.519e+05 4.223e+05 1.070 0.28474
Group50,000,000 - 99,999,999 4.021e+05 4.235e+05 0.950 0.34247
Group500,000 - 999,999 2.195e+05 3.880e+05 0.566 0.57170
Group500,000,000 - and over 3.073e+06 3.846e+05 7.991 2.33e-15 ***
GroupZero or Net Loss 9.536e+05 3.339e+05 2.856 0.00433 **
Num -4.520e+02 8.569e+02 -0.527 0.59791
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 3868000 on 1867 degrees of freedom
Multiple R-squared: 0.2361, Adjusted R-squared: 0.2144
F-statistic: 10.88 on 53 and 1867 DF, p-value: < 2.2e-16
GVIF Df GVIF^(1/(2*Df))
Year 2.125869 1 1.458036
Name 3.521453 43 1.014746
Group 1.510763 8 1.026124
Num 1.645484 1 1.282764
income.model <- lm(sat.formula, data = income_cleaned)
sat.model.summary(industry_cleaned, sat.field, sat.formula)
Shapiro-Wilk normality test
data: df[[field]]
W = 0.22287, p-value < 2.2e-16
[1] "kurtosis"
[1] 138.5482
Call:
lm(formula = sat.formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-11581741 -371031 -23164 154182 28917959
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.926e+07 2.051e+07 -1.914 0.055677 .
Year 1.936e+04 1.016e+04 1.905 0.056959 .
NameAlternative Fuels and Electric Vehicle Recharging Property Credit 5.584e+04 5.062e+05 0.110 0.912180
NameAlternative Minimum Tax Credit 3.523e+05 4.208e+05 0.837 0.402556
NameBeer Production Credit 8.799e+04 6.261e+05 0.141 0.888249
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15 1.919e+06 4.473e+05 4.290 1.86e-05 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 4.064e+06 7.278e+05 5.584 2.62e-08 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08 1.054e+06 4.779e+05 2.206 0.027462 *
NameBrownfield Tax Credits - Remediation Real Property Tax Credit 1.327e+05 4.760e+05 0.279 0.780429
NameClean Heating Fuel Credit 2.785e+05 4.595e+05 0.606 0.544549
NameConservation Easement Tax Credit 2.743e+04 4.700e+05 0.058 0.953457
NameCredit for Employment of Persons with Disabilities 1.574e+05 5.069e+05 0.311 0.756142
NameCredit for Purchase of an Automated External Defibrillator 1.082e+05 4.465e+05 0.242 0.808639
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities 2.090e+05 8.286e+05 0.252 0.800866
NameEmpire State Apprentice Tax Credit -3.324e+05 1.012e+06 -0.329 0.742537
NameEmpire State Commercial Production Credit 3.524e+05 6.172e+05 0.571 0.568123
NameEmpire State Film Post Production Credit 5.332e+05 5.224e+05 1.021 0.307585
NameEmpire State Film Production Credit 1.170e+07 4.785e+05 24.458 < 2e-16 ***
NameEmpire State Musical and Theatrical Production Credit 7.676e+04 9.010e+05 0.085 0.932112
NameExcelsior Jobs Program Credit 7.086e+05 4.393e+05 1.613 0.106849
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit 1.381e+06 4.318e+05 3.199 0.001399 **
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit 3.715e+05 4.213e+05 0.882 0.377942
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes 1.160e+06 4.192e+05 2.767 0.005700 **
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners 3.634e+05 4.311e+05 0.843 0.399359
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit 2.567e+05 4.265e+05 0.602 0.547350
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners 7.379e+04 5.041e+05 0.146 0.883634
NameFarm Workforce Retention Credit 2.727e+04 5.352e+05 0.051 0.959361
NameFarmers' School Tax Credit 1.287e+05 5.133e+05 0.251 0.802102
NameHire a Veteran Credit 1.459e+05 8.253e+05 0.177 0.859709
NameHistoric Properties Rehabilitation Credit 1.875e+06 4.841e+05 3.874 0.000110 ***
NameInvestment Tax Credit 7.182e+05 4.129e+05 1.739 0.082075 .
NameInvestment Tax Credit for the Financial Services Industry 6.339e+05 5.757e+05 1.101 0.270913
NameLife Sciences Research & Development Tax Credit -2.243e+03 9.007e+05 -0.002 0.998014
NameLong-Term Care Insurance Credit 1.385e+05 4.200e+05 0.330 0.741537
NameLow-Income Housing Credit 1.642e+06 5.494e+05 2.988 0.002833 **
NameMinimum Wage Reimbursement Credit 9.624e+04 4.381e+05 0.220 0.826159
NameMortgage Servicing Tax Credit 2.077e+05 6.462e+05 0.321 0.747934
NameNew York Youth Jobs Program Tax Credit 1.953e+05 4.273e+05 0.457 0.647604
NameQETC Capital Tax Credit 2.986e+05 5.868e+05 0.509 0.610823
NameQETC Employment Credit 8.982e+04 4.465e+05 0.201 0.840607
NameQETC Facilities, Operations, and Training Credit 3.477e+05 6.711e+05 0.518 0.604441
NameReal Property Tax Relief Credit for Manufacturing 1.520e+05 4.423e+05 0.344 0.731171
NameSpecial Additional Mortgage Recording Tax Credit 2.609e+05 4.432e+05 0.589 0.556155
NameSTART-UP NY Tax Elimination Credit 1.703e+04 4.750e+05 0.036 0.971411
GroupAdministrative and Support and Waste Management and Remediation Services -2.280e+03 2.519e+05 -0.009 0.992777
GroupAdministrative/Support/Waste Management/Remediation Services -3.173e+03 2.798e+05 -0.011 0.990952
GroupAgriculture, Forestry, Fishing and Hunting 7.269e+04 2.330e+05 0.312 0.755097
GroupArts, Entertainment, and Recreation 5.866e+05 2.317e+05 2.532 0.011408 *
GroupConstruction -1.699e+04 2.171e+05 -0.078 0.937621
GroupEducational Services 5.569e+04 2.948e+05 0.189 0.850176
GroupFinance and Insurance 2.139e+05 2.034e+05 1.051 0.293237
GroupHealth Care and Social Assistance 1.300e+04 2.290e+05 0.057 0.954731
GroupInformation -2.433e+05 2.178e+05 -1.117 0.263917
GroupManagement of Companies and Enterprises 3.355e+05 1.949e+05 1.721 0.085332 .
GroupManufacturing 6.786e+05 2.028e+05 3.346 0.000831 ***
GroupMining -1.561e+04 3.593e+05 -0.043 0.965352
GroupMining, Quarrying, and Oil and Gas Extraction 1.084e+05 3.011e+05 0.360 0.718901
GroupOther Services (except Public Administration) -6.462e+04 2.217e+05 -0.291 0.770697
GroupProfessional, Scientific, and Technical Services 3.736e+05 2.085e+05 1.792 0.073214 .
GroupReal Estate and Rental and Leasing 9.506e+04 2.044e+05 0.465 0.641882
GroupRetail Trade 4.746e+04 2.036e+05 0.233 0.815756
GroupTransportation and Warehousing -1.866e+04 2.300e+05 -0.081 0.935321
GroupUtilities 5.308e+05 2.633e+05 2.016 0.043917 *
GroupWholesale Trade 6.106e+04 2.081e+05 0.293 0.769261
Num -9.657e+02 4.272e+02 -2.260 0.023889 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1612000 on 2411 degrees of freedom
Multiple R-squared: 0.4674, Adjusted R-squared: 0.4533
F-statistic: 33.06 on 64 and 2411 DF, p-value: < 2.2e-16
GVIF Df GVIF^(1/(2*Df))
Year 2.190806 1 1.480137
Name 5.185764 42 1.019787
Group 2.724217 20 1.025371
Num 1.370920 1 1.170863
industry.model <- lm(sat.formula, data = industry_cleaned)
Selecting Specific Diagnostic plots for linear models
plot(income.model, which = 1)
plot(income.model, which = 2)
plot(income.model, which = 3)
plot(income.model, which = 5)
Correcting violation of Normality in previous model with BoxCox transform
bc_func <- function (lm.cleaned, lambda.range){
bc = boxCox(lm.cleaned, lambda = lambda.range)
#Extracting the best lambda value.
return(bc$x[which(bc$y == max(bc$y))])
}
#Income Group Dataset
income.lambda.bc = bc_func(income.model, seq(-0.2, 0.2, 1/10))
income.lambda.bc
[1] -0.01414141
#Industry Group Dataset
industry.lambda.bc = bc_func(industry.model, seq(-0.2, 0.2, 1/10))
industry.lambda.bc
[1] -0.03434343
bc_transform <- function(df, lambda.bc){
return (df %>%
mutate(Avg.bc = (Avg^lambda.bc -1)/lambda.bc) %>%
select(-c(Avg))) #took out field Amount
}
#Income Group Dataset
income_cleaned_bc <- bc_transform(income_cleaned, income.lambda.bc)
income.model.bc = lm(Avg.bc ~ ., data = income_cleaned_bc)
#Industry Group Dataset
industry_cleaned_bc <- bc_transform(industry_cleaned, industry.lambda.bc)
industry.model.bc = lm(Avg.bc ~ ., data = industry_cleaned_bc)
Testing out bc_func for migration to Shiny App global.R file
bc_funct <- function (df, lm.cleaned, lambda.range){
bc = boxCox(lm.cleaned, lambda = lambda.range)
lambda.bc = bc$x[which(bc$y == max(bc$y))]
return(df %>%
mutate(Avg.bc = (Avg^lambda.bc -1)/lambda.bc) %>%
select(-c(Avg)))
}
bc_funct(income_cleaned, income.model, seq(-0.2, 0.2, 1/10))
bc_func2 <- function (){
bc = boxCox(lm(Avg ~ ., data = industry_cleaned), lambda = seq(-0.2, 0.2, 1/10))
lambda.bc = bc$x[which(bc$y == max(bc$y))]
return(industry_cleaned %>%
mutate(Avg.bc = (Avg^lambda.bc -1)/lambda.bc) %>%
select(-c(Avg)))
}
industry_cleaned_bc <-bc_func2()
industry_cleaned_bc
Checking linear regression assumptions for the transformed data.
sat.formula.bc <- Avg.bc ~ .
sat.field.bc <- 'Avg.bc'
#Income
sat.model.summary(income_cleaned_bc, sat.field.bc, sat.formula.bc)
Shapiro-Wilk normality test
data: df[[field]]
W = 0.99696, p-value = 0.000782
[1] "kurtosis"
[1] 2.744669
Call:
lm(formula = sat.formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-6.6597 -0.5738 -0.0113 0.5668 4.4826
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -3.634e+01 1.456e+01 -2.496 0.012643 *
Year 2.288e-02 7.214e-03 3.171 0.001543 **
NameAlternative Fuels and Electric Vehicle Recharging Property Credit -8.643e-01 3.281e-01 -2.634 0.008510 **
NameAlternative Minimum Tax Credit -2.052e+00 2.577e-01 -7.962 2.91e-15 ***
NameBeer Production Credit 5.687e-01 3.407e-01 1.669 0.095234 .
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15 1.715e+00 2.630e-01 6.521 8.99e-11 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 2.676e+00 3.754e-01 7.130 1.43e-12 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08 1.331e+00 2.622e-01 5.076 4.24e-07 ***
NameBrownfield Tax Credits - Remediation Real Property Tax Credit 5.458e-02 2.620e-01 0.208 0.834991
NameClean Heating Fuel Credit -3.086e+00 2.705e-01 -11.409 < 2e-16 ***
NameConservation Easement Tax Credit -2.137e+00 2.810e-01 -7.606 4.45e-14 ***
NameCredit for Employment of Persons with Disabilities -3.118e+00 2.956e-01 -10.547 < 2e-16 ***
NameCredit for Purchase of an Automated External Defibrillator -2.938e+00 2.560e-01 -11.473 < 2e-16 ***
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities -5.673e-01 4.698e-01 -1.207 0.227430
NameEmpire State Apprentice Tax Credit -2.207e+00 4.024e-01 -5.485 4.71e-08 ***
NameEmpire State Commercial Production Credit 6.075e-02 3.410e-01 0.178 0.858634
NameEmpire State Film Post Production Credit 1.101e+00 2.769e-01 3.977 7.25e-05 ***
NameEmpire State Film Production Credit 2.853e+00 2.632e-01 10.838 < 2e-16 ***
NameEmpire State Musical and Theatrical Production Credit 2.508e-01 4.688e-01 0.535 0.592803
NameExcelsior Jobs Program Credit 4.651e-01 2.521e-01 1.845 0.065178 .
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit 8.876e-01 2.341e-01 3.792 0.000154 ***
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit 1.775e-01 2.433e-01 0.730 0.465701
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes 1.220e+00 2.440e-01 5.000 6.26e-07 ***
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners 1.182e-01 2.522e-01 0.469 0.639260
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit -8.497e-01 2.479e-01 -3.427 0.000623 ***
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners -1.358e+00 2.756e-01 -4.928 9.02e-07 ***
NameFarm Workforce Retention Credit -1.612e+00 3.394e-01 -4.749 2.20e-06 ***
NameFarmers' School Tax Credit -1.311e+00 2.696e-01 -4.863 1.26e-06 ***
NameHire a Veteran Credit -2.915e+00 4.706e-01 -6.195 7.15e-10 ***
NameHistoric Properties Rehabilitation Credit 1.918e+00 2.786e-01 6.886 7.81e-12 ***
NameIndustrial or Manufacturing Business Tax Credit -1.718e+00 2.836e-01 -6.059 1.66e-09 ***
NameInvestment Tax Credit 7.147e-02 2.349e-01 0.304 0.760985
NameInvestment Tax Credit for the Financial Services Industry 3.176e-01 2.662e-01 1.193 0.232870
NameLife Sciences Research & Development Tax Credit 7.033e-01 5.543e-01 1.269 0.204728
NameLong-Term Care Insurance Credit -2.863e+00 2.590e-01 -11.051 < 2e-16 ***
NameLow-Income Housing Credit -9.063e-01 2.940e-01 -3.083 0.002080 **
NameMinimum Wage Reimbursement Credit -1.241e+00 2.656e-01 -4.674 3.17e-06 ***
NameMortgage Servicing Tax Credit -9.378e-01 2.865e-01 -3.273 0.001084 **
NameNew York Youth Jobs Program Tax Credit -1.330e+00 2.478e-01 -5.367 9.01e-08 ***
NameQETC Capital Tax Credit 4.561e-01 3.661e-01 1.246 0.212956
NameQETC Employment Credit -5.888e-01 2.709e-01 -2.174 0.029855 *
NameQETC Facilities, Operations, and Training Credit 5.799e-01 5.065e-01 1.145 0.252462
NameReal Property Tax Relief Credit for Manufacturing -1.518e+00 2.550e-01 -5.956 3.09e-09 ***
NameSpecial Additional Mortgage Recording Tax Credit -2.309e-01 2.441e-01 -0.946 0.344374
NameSTART-UP NY Tax Elimination Credit -2.193e+00 2.985e-01 -7.345 3.05e-13 ***
Group1,000,000 - 24,999,999 1.090e+00 9.246e-02 11.785 < 2e-16 ***
Group100,000 - 499,999 3.590e-01 9.451e-02 3.798 0.000150 ***
Group100,000,000 - 499,999,999 1.685e+00 1.040e-01 16.202 < 2e-16 ***
Group25,000,000 - 49,999,999 1.325e+00 1.115e-01 11.883 < 2e-16 ***
Group50,000,000 - 99,999,999 1.473e+00 1.118e-01 13.172 < 2e-16 ***
Group500,000 - 999,999 6.032e-01 1.025e-01 5.886 4.67e-09 ***
Group500,000,000 - and over 2.332e+00 1.016e-01 22.953 < 2e-16 ***
GroupZero or Net Loss 9.934e-01 8.817e-02 11.267 < 2e-16 ***
Num -1.533e-03 2.263e-04 -6.775 1.66e-11 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 1.022 on 1867 degrees of freedom
Multiple R-squared: 0.7368, Adjusted R-squared: 0.7293
F-statistic: 98.61 on 53 and 1867 DF, p-value: < 2.2e-16
GVIF Df GVIF^(1/(2*Df))
Year 2.125869 1 1.458036
Name 3.521453 43 1.014746
Group 1.510763 8 1.026124
Num 1.645484 1 1.282764
#Industry
sat.model.summary(industry_cleaned_bc, sat.field.bc, sat.formula.bc)
Shapiro-Wilk normality test
data: df[[field]]
W = 0.9902, p-value = 5.826e-12
[1] "kurtosis"
[1] 2.513097
Call:
lm(formula = sat.formula, data = df)
Residuals:
Min 1Q Median 3Q Max
-6.2888 -0.4697 0.0183 0.4928 3.9068
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -6.095e+01 1.107e+01 -5.505 4.09e-08 ***
Year 3.440e-02 5.488e-03 6.269 4.30e-10 ***
NameAlternative Fuels and Electric Vehicle Recharging Property Credit 2.665e-01 2.733e-01 0.975 0.329737
NameAlternative Minimum Tax Credit -2.463e+00 2.272e-01 -10.839 < 2e-16 ***
NameBeer Production Credit 6.334e-01 3.381e-01 1.873 0.061126 .
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 6/23/08 but before 7/1/15 2.198e+00 2.415e-01 9.100 < 2e-16 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - On or after 7/1/15 2.727e+00 3.930e-01 6.939 5.06e-12 ***
NameBrownfield Tax Credits - Redevelopment Tax Credit - Prior to 6/23/08 1.586e+00 2.581e-01 6.148 9.17e-10 ***
NameBrownfield Tax Credits - Remediation Real Property Tax Credit 9.695e-01 2.570e-01 3.772 0.000166 ***
NameClean Heating Fuel Credit -2.544e+00 2.481e-01 -10.252 < 2e-16 ***
NameConservation Easement Tax Credit -1.123e+00 2.538e-01 -4.423 1.02e-05 ***
NameCredit for Employment of Persons with Disabilities -1.245e+00 2.737e-01 -4.549 5.67e-06 ***
NameCredit for Purchase of an Automated External Defibrillator -1.535e+00 2.411e-01 -6.368 2.29e-10 ***
NameCredit for Taxicabs & Livery Service Vehicles Accessible to Persons with Disabilities -8.171e-02 4.474e-01 -0.183 0.855119
NameEmpire State Apprentice Tax Credit -8.683e-01 5.464e-01 -1.589 0.112125
NameEmpire State Commercial Production Credit 6.235e-01 3.333e-01 1.871 0.061481 .
NameEmpire State Film Post Production Credit 1.439e+00 2.821e-01 5.100 3.66e-07 ***
NameEmpire State Film Production Credit 3.232e+00 2.584e-01 12.509 < 2e-16 ***
NameEmpire State Musical and Theatrical Production Credit 1.012e+00 4.865e-01 2.080 0.037608 *
NameExcelsior Jobs Program Credit 1.643e+00 2.372e-01 6.929 5.42e-12 ***
NameEZ/QEZE Tax Credits - EZ Investment Tax Credit 1.310e+00 2.332e-01 5.617 2.17e-08 ***
NameEZ/QEZE Tax Credits - EZ Wage Tax Credit 7.127e-01 2.275e-01 3.133 0.001750 **
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes 1.707e+00 2.264e-01 7.540 6.61e-14 ***
NameEZ/QEZE Tax Credits - QEZE Credit for Real Property Taxes For Corporate Partners 9.777e-01 2.328e-01 4.200 2.77e-05 ***
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit 1.609e-01 2.303e-01 0.699 0.484921
NameEZ/QEZE Tax Credits - QEZE Tax Reduction Credit For Corporate Partners -5.241e-01 2.722e-01 -1.925 0.054320 .
NameFarm Workforce Retention Credit -8.623e-01 2.890e-01 -2.984 0.002876 **
NameFarmers' School Tax Credit -1.158e+00 2.771e-01 -4.179 3.03e-05 ***
NameHire a Veteran Credit -1.040e+00 4.456e-01 -2.335 0.019638 *
NameHistoric Properties Rehabilitation Credit 2.397e+00 2.614e-01 9.172 < 2e-16 ***
NameInvestment Tax Credit 6.981e-01 2.230e-01 3.131 0.001763 **
NameInvestment Tax Credit for the Financial Services Industry 1.560e+00 3.108e-01 5.019 5.59e-07 ***
NameLife Sciences Research & Development Tax Credit 9.057e-01 4.864e-01 1.862 0.062697 .
NameLong-Term Care Insurance Credit -1.867e+00 2.268e-01 -8.231 3.01e-16 ***
NameLow-Income Housing Credit 1.331e+00 2.967e-01 4.487 7.57e-06 ***
NameMinimum Wage Reimbursement Credit -9.161e-01 2.366e-01 -3.872 0.000111 ***
NameMortgage Servicing Tax Credit 8.932e-01 3.489e-01 2.560 0.010534 *
NameNew York Youth Jobs Program Tax Credit 1.263e-01 2.307e-01 0.548 0.584069
NameQETC Capital Tax Credit 1.162e+00 3.168e-01 3.667 0.000251 ***
NameQETC Employment Credit -3.048e-01 2.411e-01 -1.264 0.206249
NameQETC Facilities, Operations, and Training Credit 1.147e+00 3.624e-01 3.165 0.001571 **
NameReal Property Tax Relief Credit for Manufacturing -6.559e-01 2.388e-01 -2.746 0.006077 **
NameSpecial Additional Mortgage Recording Tax Credit 7.505e-01 2.393e-01 3.136 0.001736 **
NameSTART-UP NY Tax Elimination Credit -1.834e+00 2.565e-01 -7.151 1.14e-12 ***
GroupAdministrative and Support and Waste Management and Remediation Services 2.878e-01 1.360e-01 2.116 0.034417 *
GroupAdministrative/Support/Waste Management/Remediation Services 1.675e-01 1.511e-01 1.109 0.267612
GroupAgriculture, Forestry, Fishing and Hunting -1.132e-01 1.258e-01 -0.899 0.368486
GroupArts, Entertainment, and Recreation 5.595e-01 1.251e-01 4.473 8.09e-06 ***
GroupConstruction -4.299e-02 1.172e-01 -0.367 0.713860
GroupEducational Services 2.253e-01 1.592e-01 1.415 0.157160
GroupFinance and Insurance 5.350e-01 1.099e-01 4.870 1.19e-06 ***
GroupHealth Care and Social Assistance -6.777e-02 1.237e-01 -0.548 0.583720
GroupInformation 7.098e-01 1.176e-01 6.037 1.81e-09 ***
GroupManagement of Companies and Enterprises 6.672e-01 1.052e-01 6.340 2.73e-10 ***
GroupManufacturing 4.608e-01 1.095e-01 4.208 2.67e-05 ***
GroupMining 2.515e-01 1.940e-01 1.296 0.194977
GroupMining, Quarrying, and Oil and Gas Extraction 3.869e-01 1.626e-01 2.379 0.017421 *
GroupOther Services (except Public Administration) -1.550e-01 1.197e-01 -1.295 0.195482
GroupProfessional, Scientific, and Technical Services 4.875e-01 1.126e-01 4.331 1.55e-05 ***
GroupReal Estate and Rental and Leasing 1.461e-01 1.104e-01 1.324 0.185596
GroupRetail Trade 3.754e-01 1.100e-01 3.414 0.000650 ***
GroupTransportation and Warehousing 2.125e-01 1.242e-01 1.711 0.087179 .
GroupUtilities 6.972e-01 1.422e-01 4.904 1.00e-06 ***
GroupWholesale Trade 4.367e-01 1.124e-01 3.886 0.000105 ***
Num 4.542e-04 2.307e-04 1.969 0.049082 *
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 0.8703 on 2411 degrees of freedom
Multiple R-squared: 0.7631, Adjusted R-squared: 0.7569
F-statistic: 121.4 on 64 and 2411 DF, p-value: < 2.2e-16
GVIF Df GVIF^(1/(2*Df))
Year 2.190806 1 1.480137
Name 5.185764 42 1.019787
Group 2.724217 20 1.025371
Num 1.370920 1 1.170863
avPlots(income.model.bc)
NA
NA
NA
NA
NA
BIC comparison before and after BoxCox transform
BIC(income.model.bc, income.model)
BIC(industry.model.bc, industry.model)
Stepwise Regression on Income_cat_bc (boxcox transformed dataset)
clean_colname <- function(cols) {
return(str_replace_all(cols, "[-'/ ,�&()`]", '_'))
}
#creating dummy variable columns for stepwise
dummy_func <- function (df){
x = model.matrix(Avg.bc ~., df)[, -1]
dummy_bc = as.data.frame(x) %>% mutate(Avg.bc = df$Avg.bc)
colnames(dummy_bc) <- clean_colname(colnames(dummy_bc))
#colnames(dummy_bc) <- str_replace_all(colnames(dummy_bc), "[-'/ ,�&()`]", '_')
return(dummy_bc)
}
Cleaning column names further so stepwise regression doesn’t present any errors
#Income Group Dataset
income.dummy.bc <- dummy_func(income_cleaned_bc)
#colnames(income.dummy.bc)[37] <- 'NameManufactureru0092s_Real_Property_Tax_Credit'
colnames(income.dummy.bc)
#Industry Group Dataset
industry.dummy.bc <- dummy_func(industry_cleaned_bc)
#colnames(industry.dummy.bc)[35] <- 'NameManufactureru0092s_Real_Property_Tax_Credit'
colnames(industry.dummy.bc)
Stepwise regression using BIC as the criteria (k = log(n)).
#Creating Stepwise Models
bcs = list(income = income.dummy.bc, industry = industry.dummy.bc)
model.fulls = list(income = lm(Avg.bc ~ ., data = income.dummy.bc), industry = lm(Avg.bc ~ ., data = industry.dummy.bc)) #All variables
model.emptys = list(income = lm(Avg.bc ~ 1, data = income.dummy.bc), industry = lm(Avg.bc ~ 1, data = industry.dummy.bc)) #intercept only
k = c('income', 'industry')
forwardBIC = list(income = NULL, industry = NULL)
backwardBIC = list(income = NULL, industry = NULL)
for (i in k){
bc = bcs[[i]]
scope = list(lower = formula(model.emptys[[i]]), upper = formula(model.fulls[[i]]))
n_obs = bc %>% dplyr::count() %>% dplyr::first()
forwardBIC[[i]] = step(model.emptys[[i]], scope, direction = "forward", k = log(n_obs))
backwardBIC[[i]] = step(model.fulls[[i]], scope, direction = "backward", k = log(n_obs))
}
Selecting Best Formula per Dataset from Stepwise Regressions
bic_func <- function (BIC.model){
print('Adjusted R Squared:')
print(summary(BIC.model)$adj.r.squared)
print('Number of Coefficients:')
print(dim(summary(BIC.model)$coefficient)[1])
print('VIF Check: ')
print(max(vif(BIC.model)))
print("*************************")
return(c(summary(BIC.model)$adj.r.squared, dim(summary(BIC.model)$coefficient)[1], max(vif(BIC.model))))
}
as.data.frame(rbind(c('income', 'forward', bic_func(forwardBIC[['income']])),
c('income', 'backward', bic_func(backwardBIC[['income']])),
c('industry', 'forward', bic_func(forwardBIC[['industry']])),
c('industry', 'backward', bic_func(backwardBIC[['industry']]))
)
) %>% select(dataset = V1, Stepwise = V2, `Adjusted R^2` = V3, `Number of Coefficients` = V4, `Maximum VIF` = V5) %>% write_csv('Shiny_app/data/stepwiseBIC_results.csv')
bic_func(forwardBIC[['income']])
bic_func(backwardBIC[['income']])
bic_func(forwardBIC[['industry']])
bic_func(backwardBIC[['industry']])
#Manual reduction of variables using VIF and then checked versus saturated model with Anova. This was not used because the saturated model contained multicollinearity issues as indicated by a high VIF score on some coefficients. And the anova test suggested that the coefficients removed in the reduced model were informative in our model, so we couldn't use it either. Thus Stepwise reduction is the preferred method for best model fit.
# VIF.variables <- as.data.frame(vif(model.fulls[['industry']])) %>%
# select(VIF = `vif(model.fulls[["industry"]])`) %>%
# filter(VIF > 5) %>% rownames()
#
# industry.dummy.bc.VIF <- industry.dummy.bc %>% select(-all_of(VIF.variables))
# industry.model.VIF <- lm(Avg.bc ~ ., data = industry.dummy.bc.VIF)
# summary(industry.model.VIF)
# anova(industry.model.VIF, model.fulls[['industry']])
Best Model Selection from Stepwise
#The Best Models selected for both income and industry were forwardBIC.
#Income Group Dataset
income.best.formula <- forwardBIC[['income']]$call[[2]]
income.best.formula
#Industry Group Dataset
industry.best.formula <- forwardBIC[['industry']]$call[[2]]
industry.best.formula
Splitting data up into test data and training data (test data is for year 2019, training is the rest)
test_train_split <- function(dummy_bc, best.formula) {
# data.test <- dummy_bc %>% filter(Year == 2019)
# data.train <- dummy_bc %>% filter(Year != 2019)
X <- model.matrix(best.formula, data = dummy_bc)[,-1]
y <- as.matrix(dummy_bc %>% select(all.vars(best.formula)[1]))
set.seed(0)
train.i = sample(1:nrow(dummy_bc), 0.8*nrow(dummy_bc), replace = F)
#train
X.train <- X[train.i,]
y.train <- y[train.i,]
#test
X.test <- X[-train.i,]
y.test <- y[-train.i,]
data.train <- as.data.frame(cbind(y.train, X.train))
data.test <- as.data.frame(cbind(y.test, X.test))
colnames(data.train)[1] = all.vars(best.formula)[1]
colnames(data.test)[1] = all.vars(best.formula)[1]
return (list('X.train' = X.train, 'y.train' = y.train, 'X.test' = X.test, 'y.test' = y.test, 'data.train' = data.train, 'data.test' = data.test, 'data' = dummy_bc))
}
Lasso regression for comparison to Forward Stepwise and Function to show metrics (R^2 and MSE) for Regularization (Ridge/Lasso)
regularization_func <- function (data, alpha, name){
X.train <- data[['X.train']]
y.train <- data[['y.train']]
X.test <- data [['X.test']]
y.test <- data[['y.test']]
data.test <- data[['data.test']]
# print(colnames(X.train))
# print(colnames(X.test))
# print(setdiff(colnames(X.train), colnames(X.test)))
#create lambda grid
lambda.grid = 10^seq(10, -10, length = 100)
#create lasso models with lambda.grid
lasso.models = glmnet(X.train, y.train, alpha = alpha, lambda = lambda.grid)
#visualize coefficient shrinkage
# plot(lasso.models, xvar = "lambda", label = TRUE, main = paste("Lasso Regression:", i))
#Cross Validation to find best lambda
set.seed(0)
cv.lasso.models <- cv.glmnet(X.train, y.train, alpha = alpha, lambda = lambda.grid, nfolds = 10)
#visualize cross validation for lambda that minimizes the mean squared error.
plot(cv.lasso.models, main = paste("Alpha:", alpha, "Regression:", name))
#Checking the best lambda
# log(cv.lasso.models$lambda.min)
# best.lambda <- cv.lasso.models$lambda.min
# print(paste(i, ' best.lambda:', best.lambda))
# best lambda with all the variables was found to be 0.0006892612
# best lambda with only the bwdBIC coefficients included was found to be 0.0003053856
#looking at the lasso coefficients for the best.lambda
# best.lambda.coeff <- predict(lasso.models, s = cv.lasso.models$lambda.min, type = "coefficients")
# print('Number of Coefficients:')
# print(dim(best.lambda.coeff)[1])
#fitting a model with the best lambda found to be 0.000689 and using it to make predictions for the testing data.
lasso.best.lambda.train.pred <- predict(lasso.models, s = cv.lasso.models$lambda.min, newx = X.test)
lasso.best.lambda.train.pred
#checking MSE
MSE.lasso <- mean((lasso.best.lambda.train.pred - y.test)^2)
print(paste('Dimensions of the Alpha:', alpha, ' Regression Coefficients for:', name))
print(dim(coef(lasso.models))[1])
p = dim(coef(lasso.models))[1]
return(eval_results(y.test, lasso.best.lambda.train.pred, data.test, p)['Rsquare'])
}
Function for calculating adjusted R squared
# Calculate R squared from true values and predictions
eval_results <- function(true, predicted, df, p) {
n = nrow(df)
adj.RSS <- sum((predicted - true)^2)/(n-p-1)
adj.TSS <- sum((true - mean(true))^2)/(n-1)
adj.R_square <- 1 - adj.RSS / adj.TSS
#RMSE = sqrt(RSS/n)
return(c(Rsquare = adj.R_square))
}
Creates data structure that stores formulas and full,train, and test datasets for final model metric comparison
formulas <- list(income_cleaned = sat.formula,
income_cleaned_bc = sat.formula.bc,
income.data.split.sat = sat.formula.bc,
income.data.split.best = income.best.formula,
industry_cleaned = sat.formula,
industry_cleaned_bc = sat.formula.bc,
industry.data.split.sat = sat.formula.bc,
industry.data.split.best = industry.best.formula)
all.splits <- list(
'income_cleaned' = test_train_split(income_cleaned, formulas[['income_cleaned']]),
'income_cleaned_bc' = test_train_split(income_cleaned_bc, formulas[['income_cleaned_bc']]),
'income.data.split.sat' = test_train_split(income.dummy.bc, formulas[['income.data.split.sat']]),
'income.data.split.best' = test_train_split(income.dummy.bc, formulas[['income.data.split.best']]),
'industry_cleaned' = test_train_split(industry_cleaned, formulas[['industry_cleaned']]),
'industry_cleaned_bc' = test_train_split(industry_cleaned_bc, formulas[['industry_cleaned_bc']]),
'industry.data.split.sat' = test_train_split(industry.dummy.bc, formulas[['industry.data.split.sat']]),
'industry.data.split.best' = test_train_split(industry.dummy.bc, formulas[['industry.data.split.best']])
)
regularization_func(all.splits[['income_cleaned']], 0, 'income_cleaned')
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: income_cleaned"
[1] 54
Rsquare
-0.01019062
Creating models for rsquared comparison
#For loops initialization
no.reg.r2 <- c()
lasso.reg.r2 <- c()
ridge.reg.r2 <- c()
for (i in names(all.splits)) {
#no regularization
m = lm(formulas[[i]], all.splits[[i]][['data.train']])
#no.reg.r2 <- c(no.reg.r2, summary(m)$adj.r.squared)
y.predict = predict(m, newdata = as.data.frame(all.splits[[i]][['X.test']]))
adj.R2 <- eval_results(all.splits[[i]][['y.test']], y.predict, all.splits[[i]][['data.test']], length(coef(m)))['Rsquare']
no.reg.r2 <- c(no.reg.r2, adj.R2)
#lasso regularization
lasso.reg.r2 <- c(lasso.reg.r2, regularization_func(all.splits[[i]], 1, i))
#ridge regularization
ridge.reg.r2 <- c(ridge.reg.r2, regularization_func(all.splits[[i]], 0, i))
}
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 1 Regression Coefficients for: income_cleaned"
[1] 54
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: income_cleaned"
[1] 54
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 1 Regression Coefficients for: income_cleaned_bc"
[1] 54
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: income_cleaned_bc"
[1] 54
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 1 Regression Coefficients for: income.data.split.sat"
[1] 54
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: income.data.split.sat"
[1] 54
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 1 Regression Coefficients for: income.data.split.best"
[1] 41
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: income.data.split.best"
[1] 41
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 1 Regression Coefficients for: industry_cleaned"
[1] 65
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: industry_cleaned"
[1] 65
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 1 Regression Coefficients for: industry_cleaned_bc"
[1] 65
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: industry_cleaned_bc"
[1] 65
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 1 Regression Coefficients for: industry.data.split.sat"
[1] 65
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: industry.data.split.sat"
[1] 65
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 1 Regression Coefficients for: industry.data.split.best"
[1] 42
Warning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' valuesWarning: collapsing to unique 'x' values
Warning: collapsing to unique 'x' values
[1] "Dimensions of the Alpha: 0 Regression Coefficients for: industry.data.split.best"
[1] 42
Adjusted R squared table for model comparison between Lasso, Ridge and No reg on cleaned, cleaned_bc, data.split.sat and data.split.best
df.rsquare <- as.data.frame(cbind('No Regularization' = no.reg.r2, 'Lasso' = lasso.reg.r2, 'Ridge' = ridge.reg.r2))
rownames(df.rsquare) = names(all.splits)
df.rsquare %>% tibble::rownames_to_column(var = 'Dataset') #%>% write_csv('Shiny_app/data/final_rsquare_table.csv')
Generating predictions using best model
Creating key dataframe for changes between user input colnames and dummy dataset colnames
user_input_pred %>% sapply(boxcox_to_dollars, USE.NAMES = T)
[1] 57857.47 45264.94 74016.44
For loop to create Master dataset with Industry and Income for true and predicted values of Avg.bc and Avg in Dollars
#For loop initialization list
joined.true.pred <- list('income' = NULL, 'industry' = NULL)
lambda.bcs <- list('income' = income.lambda.bc, 'industry' = industry.lambda.bc)
for (i in c('income', 'industry')) {
data.index <- paste0(i, '.data.split.best')
m <- lm(formulas[[data.index]], data = all.splits[[data.index]][['data']])
#Making predictions using best model on our full dataset
m.predict <- predict(m, newdata = all.splits[[data.index]][['data']])
#Converting those target variable predictions from BoxCox transform back to Dollars
lambda.bc <- lambda.bcs[[i]]
predictions.df <- as.data.frame(m.predict) %>%
mutate(`Predicted Average in Dollars` = (m.predict*lambda.bc+1)^(1/lambda.bc)) %>%
tibble::rownames_to_column(var = 'Index')
#Loading true values for join with predictions
true.df <- all.splits[[paste0(i, '_cleaned_bc')]][['data']] %>% tibble::rownames_to_column(var = 'Index')
#Joining the predicted data with the true dataset
joined.true.pred[[i]] <- inner_join(predictions.df, true.df, by = 'Index')
#Creating Master dataset with True Avg.bc, Predicted Avg.bc, True Average in Dollars and Predicted Average in Dollars for each Year,
#Credit Name and Income Group entry.
joined.true.pred[[i]] <- joined.true.pred[[i]] %>%
mutate(`True Average in Dollars` = (Avg.bc*lambda.bc+1)^(1/lambda.bc)) %>%
relocate(`True Average in Dollars`, .before = `Predicted Average in Dollars`) %>%
relocate(Avg.bc, .before = m.predict) %>%
select(1:9,
`Predicted Avg.bc` = m.predict,
`True Avg.bc` = Avg.bc)
}
joined.true.pred[['income']]
joined.true.pred[['industry']]
combined.true.pred <- rbind(joined.true.pred[['income']] %>% mutate(dataset = 'income'), joined.true.pred[['industry']] %>% mutate(dataset = 'industry'))
combined.true.pred
Writing combined predicted and true values dataset to a csv file
combined.true.pred <- combined.true.pred %>% write_csv('Shiny_app/data/combined.true.pred.csv')
Visualizations of predictions and true values
Number of taxpayers per Group for a given Credit Name and Year input (Inputs in the filter)
income_cleaned %>% filter(Name == 'Investment Tax Credit', Year == 2019) %>% group_by(Year, Group) %>% summarise(sum(Num))
`summarise()` has grouped output by 'Year'. You can override using the `.groups` argument.